Imports¶

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from ydata_profiling import ProfileReport
from sklearn.preprocessing import OrdinalEncoder
from scipy import stats

Load data¶

In [2]:
# Read the raw dataset from a CSV path relative to the working directory.
DATA = pd.read_csv('data/input.csv')
In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Build a ydata-profiling report for the whole frame; the bare `profile`
# expression on the last line is what the notebook displays.
profile = ProfileReport(DATA, title="report")
profile
Out[3]:

In [4]:
# Split the columns by dtype. The 'y' column is kept out of the categorical
# list; the numerical columns stay a pandas Index.
categorical_cols = [
    col
    for col in DATA.select_dtypes(include=['object', 'category']).columns
    if col != 'y'
]
numerical_cols = DATA.select_dtypes(include=['number']).columns
In [5]:
# List the categorical feature names on one line.
print(f"Columns categorical: {', '.join(categorical_cols)}")
Columns categorical: Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CALC, MTRANS
In [6]:
# List the numerical feature names on one line.
print('Columns numerical: ' + ', '.join(numerical_cols))
Columns nuemrical: Age, FCVC, NCP, CH2O, FAF, TUE
In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns.
DATA.loc[:, numerical_cols].describe()
Out[7]:
Age FCVC NCP CH2O FAF TUE
count 2111.000000 2111.000000 2111.000000 2111.000000 2111.000000 2111.000000
mean 24.312600 2.419043 2.685628 2.008011 1.010298 0.657866
std 6.345968 0.533927 0.778039 0.612953 0.850592 0.608927
min 14.000000 1.000000 1.000000 1.000000 0.000000 0.000000
25% 19.947192 2.000000 2.658738 1.584812 0.124505 0.000000
50% 22.777890 2.385502 3.000000 2.000000 1.000000 0.625350
75% 26.000000 3.000000 3.000000 2.477420 1.666678 1.000000
max 61.000000 3.000000 4.000000 3.000000 3.000000 2.000000
In [8]:
def create_corr_matrix(data):
    """Draw an annotated heatmap of the pairwise correlations of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    corr = data.corr()
    labels = corr.columns
    ticks = np.arange(len(labels))

    fig, ax = plt.subplots()
    # Pin the colour scale to [-1, 1] so the neutral midpoint of the diverging
    # 'coolwarm' map corresponds to zero correlation.
    im = ax.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    for i in ticks:
        for j in ticks:
            value = corr.iloc[i, j]
            # White text is only legible on the saturated ends of the colour
            # map; use black on the pale cells near zero.
            color = "w" if abs(value) > 0.5 else "k"
            ax.text(j, i, round(value, 2), ha="center", va="center", color=color)

    fig.colorbar(im, ax=ax)
    plt.show()

# Correlation heatmap restricted to the numerical columns.
create_corr_matrix(DATA[numerical_cols])
No description has been provided for this image
In [9]:
# One histogram per numerical column, with bars coloured by the 'y' column.
for column in numerical_cols:
    sns.displot(data=DATA, x=column, hue="y", kde=False, height=5, aspect=2)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [10]:
def plot_dim_reduced(x, reductor, title, y_label):
    """Project the features of ``x`` to 2-D with ``reductor`` and scatter-plot them by label.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame that still contains the label column ``y_label``; that column is
        dropped before fitting so it does not take part in the projection.
    reductor : object
        Estimator exposing ``fit_transform`` whose output has two columns
        (e.g. ``PCA(n_components=2)``).
    title : str
        Prefix of the two component column names (``f'{title}1'`` and
        ``f'{title}2'``), which are also used as the plot's x and y variables.
    y_label : str
        Name of the label column, used for the hue.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    features = x.drop(y_label, axis=1)
    first, second = f'{title}1', f'{title}2'
    projected = pd.DataFrame(reductor.fit_transform(features), columns=[first, second])
    # The hue values are read from the module-level DATA frame, not from `x`.
    # They must be stored under `y_label` (not a fixed 'y') because the plot
    # below looks the hue column up by that name.
    projected[y_label] = DATA[y_label]
    sns.lmplot(data=projected, x=first, y=second, hue=y_label, fit_reg=False)
    plt.show()

# Ordinal-encode every column so PCA / t-SNE receive a fully numeric frame.
# NOTE(review): the encoder is applied to the numerical columns as well, so
# they are replaced by category indices rather than kept as raw values —
# confirm this is intended.
all_numerical_data = pd.DataFrame(OrdinalEncoder().fit_transform(DATA), columns=DATA.columns)
plot_dim_reduced(all_numerical_data, PCA(n_components=2), 'PCA', 'y')
# Use a 'TSNE' prefix so the t-SNE axes are not labelled as PCA components.
plot_dim_reduced(all_numerical_data, TSNE(n_components=2, random_state=0), 'TSNE', 'y')
No description has been provided for this image
No description has been provided for this image
In [11]:
# Cumulative explained-variance curve for the first 8 principal components.
# NOTE(review): all_numerical_data still contains the encoded 'y' column, so
# the label takes part in this fit — confirm whether it should be dropped.
pca = PCA(n_components=8).fit(all_numerical_data)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..k so the x value equals the number of components kept;
# plotting the values alone would start the axis at 0.
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.gcf().set_size_inches(7, 5)
No description has been provided for this image
In [12]:
# Numeric feature columns plus an ordinal-encoded copy of the 'y' column.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# The encoder is still fitted on the full frame; the encoded target is picked
# by column name instead of assuming 'y' is the last column.
ys = encoder.fit_transform(DATA)[:, DATA.columns.get_loc('y')]
# assign() builds a new frame, so the cell can be re-run safely and the
# selection returned by select_dtypes is never mutated in place.
n_df = DATA.select_dtypes(include='number').assign(y=ys)
In [13]:
# Display the numeric columns together with the encoded 'y' column.
n_df
Out[13]:
Age FCVC NCP CH2O FAF TUE y
0 21.000000 2.0 3.0 2.000000 0.000000 1.000000 0.0
1 21.000000 3.0 3.0 3.000000 3.000000 0.000000 0.0
2 23.000000 2.0 3.0 2.000000 2.000000 1.000000 0.0
3 27.000000 3.0 3.0 2.000000 2.000000 0.000000 4.0
4 22.000000 2.0 1.0 2.000000 0.000000 0.000000 5.0
... ... ... ... ... ... ... ...
2106 20.976842 3.0 3.0 1.728139 1.676269 0.906247 3.0
2107 21.982942 3.0 3.0 2.005130 1.341390 0.599270 3.0
2108 22.524036 3.0 3.0 2.054193 1.414209 0.646288 3.0
2109 24.361936 3.0 3.0 2.852339 1.139107 0.586035 3.0
2110 23.664709 3.0 3.0 2.863513 1.026452 0.714137 3.0

2111 rows × 7 columns

In [14]:
# Pairwise scatter / distribution grid of the numeric columns, coloured by the encoded 'y'.
sns.pairplot(data=n_df, hue='y')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x713c0c3c5410>
No description has been provided for this image
In [15]:
# Outlier analysis: absolute z-score of every numeric feature (the 'y' column
# is excluded), then the per-column count of values with |z| > 2.
n_df_no_y = n_df.drop(columns=['y'])
z = np.abs(stats.zscore(n_df_no_y))
(z > 2).sum()
Out[15]:
Age     144
FCVC     82
NCP     243
CH2O      0
FAF      99
TUE     144
dtype: int64
In [16]:
# Age values whose absolute z-score exceeds 2 (flagged as outliers), sorted ascending.
n_df_no_y[z > 2]['Age'].dropna().sort_values()
Out[16]:
1571    37.056193
1488    37.063599
1607    37.084742
1515    37.186795
1107    37.205173
          ...    
1158    55.022494
1088    55.137881
1013    55.246250
252     56.000000
133     61.000000
Name: Age, Length: 144, dtype: float64
In [17]:
# Age values whose absolute z-score is at most 2 (not flagged), sorted ascending.
n_df_no_y[z <= 2]['Age'].dropna().sort_values()
Out[17]:
415     14.000000
116     15.000000
276     16.000000
302     16.000000
309     16.000000
          ...    
760     36.769646
1700    36.839761
368     37.000000
362     37.000000
387     37.000000
Name: Age, Length: 1967, dtype: float64
In [18]:
# FCVC values whose absolute z-score exceeds 2 (flagged as outliers), sorted ascending.
n_df_no_y[z > 2]['FCVC'].dropna().sort_values()
Out[18]:
23      1.000000
479     1.000000
473     1.000000
449     1.000000
419     1.000000
          ...   
1006    1.317729
876     1.321028
1501    1.330700
1074    1.341380
575     1.344854
Name: FCVC, Length: 82, dtype: float64
In [19]:
# FCVC values whose absolute z-score is at most 2 (not flagged), sorted ascending.
n_df_no_y[z <= 2]['FCVC'].dropna().sort_values()
Out[19]:
1238    1.362441
1499    1.368978
1527    1.369529
1602    1.387489
1528    1.392665
          ...   
417     3.000000
418     3.000000
420     3.000000
803     3.000000
2110    3.000000
Name: FCVC, Length: 2029, dtype: float64
In [20]:
# NCP values whose absolute z-score exceeds 2 (flagged as outliers), sorted ascending.
n_df_no_y[z > 2]['NCP'].dropna().sort_values()
Out[20]:
4       1.000000
674     1.000000
675     1.000000
739     1.000000
741     1.000000
          ...   
1556    1.109956
1319    1.114564
1424    1.116401
1780    1.120102
987     1.124977
Name: NCP, Length: 243, dtype: float64
In [21]:
# NCP values whose absolute z-score is at most 2 (not flagged), sorted ascending.
n_df_no_y[z <= 2]['NCP'].dropna().sort_values()
Out[21]:
1437    1.130751
800     1.131695
1779    1.134042
1591    1.134321
1417    1.135278
          ...   
702     4.000000
122     4.000000
397     4.000000
481     4.000000
420     4.000000
Name: NCP, Length: 1868, dtype: float64
In [22]:
# FAF values whose absolute z-score exceeds 2 (flagged as outliers), sorted ascending.
n_df_no_y[z > 2]['FAF'].dropna().sort_values()
Out[22]:
1065    2.721646
927     2.724300
926     2.762711
579     2.784471
1424    2.787319
          ...   
247     3.000000
241     3.000000
239     3.000000
218     3.000000
356     3.000000
Name: FAF, Length: 99, dtype: float64
In [23]:
# FAF values whose absolute z-score is at most 2 (not flagged), sorted ascending.
n_df_no_y[z <= 2]['FAF'].dropna().sort_values()
Out[23]:
0       0.000000
440     0.000000
439     0.000000
1025    0.000000
436     0.000000
          ...   
715     2.697949
1236    2.698874
775     2.707882
1018    2.708250
1319    2.710338
Name: FAF, Length: 2012, dtype: float64
In [24]:
# TUE values whose absolute z-score exceeds 2 (flagged as outliers), sorted ascending.
n_df_no_y[z > 2]['TUE'].dropna().sort_values()
Out[24]:
1574    1.875683
1276    1.882539
639     1.884138
1105    1.886855
786     1.887386
          ...   
350     2.000000
352     2.000000
354     2.000000
334     2.000000
484     2.000000
Name: TUE, Length: 144, dtype: float64
In [25]:
# TUE values whose absolute z-score is at most 2 (not flagged), sorted ascending.
n_df_no_y[z <= 2]['TUE'].dropna().sort_values()
Out[25]:
1228    0.000000
476     0.000000
1618    0.000000
1025    0.000000
1619    0.000000
          ...   
560     1.839862
696     1.843830
1367    1.865851
1526    1.875023
1548    1.875023
Name: TUE, Length: 1967, dtype: float64